import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.cluster import OPTICS, cluster_optics_dbscan
from sklearn import metrics
from sklearn.datasets import make_blobs
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
from sklearn import preprocessing
import seaborn as sns
import statistics
# Load the pre-processed dataframe and drop the leftover CSV index column.
df = pd.read_csv('../df.csv')
df
df = df.drop(columns=['Unnamed: 0'])
df
# Check the data type of the combined coordinate strings:
type(df['Coords'][1])
df['Coords'][1]
# Build a dataframe containing only the coordinates (lat and long combined):
# Keep just the combined "Coords" column (lat and lon joined in one string).
df_coords = df['Coords']
df_coords
# Build the dataframe X with only the Lat and Lon columns, then normalise the data for clustering.
# Feature matrix: only the Lat/Lon columns, standardised before clustering.
X = df[['Lat', 'Lon']]
X
scaler = StandardScaler()
X_norm = scaler.fit_transform(X)
X_norm
# OPTICS is an evolution of DBSCAN that does not require specifying the radius.
# OPTICS run 1: min_samples=10 with the default Minkowski metric.
print ('optics 1, min_samples=10, default metric=minkowski')
optics1 = OPTICS(min_samples=10).fit(X_norm)  # default metric = minkowski
labels_opt1 = optics1.labels_
# Per-label counts (first bin, label -1, is the noise count).
cluster_sizes, bin_edges = np.histogram(labels_opt1, bins=range(-1, len(set(labels_opt1)) + 1))
size_by_label = dict(zip(bin_edges, cluster_sizes))
print ('labels', size_by_label)
print ('silhouette', metrics.silhouette_score(X_norm, labels_opt1))
print ('mean cluster dimension', statistics.mean(cluster_sizes))
print ('median cluster dimension', statistics.median(cluster_sizes))
print ('% outliers', size_by_label[-1]*100/sum(cluster_sizes))
X_norm
# Scatter plot of OPTICS run 1; labels cycle through a fixed colour palette.
colors = ['royalblue', 'maroon', 'forestgreen', 'mediumorchid', 'tan', 'deeppink', 'olive', 'goldenrod', 'lightcyan', 'navy', 'yellow', 'purple', 'black', 'grey', 'red', 'brown']
to_color = np.vectorize(lambda label: colors[label % len(colors)])
plt.scatter(X_norm[:, 0], X_norm[:, 1], c=to_color(labels_opt1), marker="o", picker=True)
plt.title('Clustering\nOPTICS 1')
plt.show()
# OPTICS run 2: larger min_samples plus xi / min_cluster_size thresholds.
# BUG FIX: the printed banner misspelled the parameter ('min_cluuster_size').
print ('optics 2, min_samples=50, xi=0.05, min_cluster_size=0.05')
optics2 = OPTICS(min_samples=50, xi=0.05, min_cluster_size=0.05).fit(X_norm) #default metric = minkowski
labels_opt2 = optics2.labels_
hist, bins = np.histogram(labels_opt2, bins=range(-1, len(set(labels_opt2)) + 1))
print ('labels', dict(zip(bins, hist)))
print ('silhouette', metrics.silhouette_score(X_norm, labels_opt2))
print ('mean cluster dimension', statistics.mean(hist))
print ('median cluster dimension', statistics.median(hist))
print ('% outliers', dict(zip(bins, hist))[-1]*100/sum(hist))
# Scatter plot of OPTICS run 2 (smaller palette — fewer clusters expected).
colors = ['royalblue', 'maroon', 'forestgreen', 'mediumorchid', 'tan', 'deeppink', 'olive']
to_color = np.vectorize(lambda label: colors[label % len(colors)])
plt.scatter(X_norm[:, 0], X_norm[:, 1], c=to_color(labels_opt2), marker="o", picker=True)
plt.title('Clustering\nOPTICS 2')
plt.show()
# OPTICS run 3: same min_samples as run 1 but with the cityblock (L1) metric.
print ('optics 3, min_samples=10, metric=cityblock')
optics3 = OPTICS(min_samples=10, metric='cityblock').fit(X_norm)
labels_opt3 = optics3.labels_
counts3, edges3 = np.histogram(labels_opt3, bins=range(-1, len(set(labels_opt3)) + 1))
counts_by_label = dict(zip(edges3, counts3))
print ('labels', counts_by_label)
print ('silhouette', metrics.silhouette_score(X_norm, labels_opt3))
print ('mean cluster dimension', statistics.mean(counts3))
print ('median cluster dimension', statistics.median(counts3))
print ('% outliers', counts_by_label[-1]*100/sum(counts3))
# Scatter plot of OPTICS run 3.
colors = ['royalblue', 'maroon', 'forestgreen', 'mediumorchid', 'tan', 'deeppink', 'olive', 'goldenrod', 'lightcyan', 'navy', 'yellow', 'purple', 'black', 'grey', 'red', 'brown']
to_color = np.vectorize(lambda label: colors[label % len(colors)])
plt.scatter(X_norm[:, 0], X_norm[:, 1], c=to_color(labels_opt3), marker="o", picker=True)
plt.title('Clustering\nOPTICS 3')
plt.show()
# Keep studying and tweaking the code below to refine the results:
from sklearn.cluster import OPTICS, cluster_optics_dbscan
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import numpy as np
# Carve DBSCAN-style labelings out of the OPTICS reachability at two radii.
labels_050, labels_100 = (
    cluster_optics_dbscan(
        reachability=optics1.reachability_,
        core_distances=optics1.core_distances_,
        ordering=optics1.ordering_,
        eps=radius,
    )
    for radius in (0.5, 1)
)
# np.arange returns evenly spaced values within the half-open interval
# [start, stop); for integer arguments it mirrors the built-in range but
# yields an ndarray instead of a list.
space = np.arange(len(X_norm))
reachability = optics1.reachability_[optics1.ordering_]
labels_opt = optics1.labels_[optics1.ordering_]
plt.figure(figsize=(10, 10))
G = gridspec.GridSpec(2, 2)
ax1 = plt.subplot(G[0, :])
ax2 = plt.subplot(G[1, :])
# Reachability plot: one colour per cluster id 0..4, noise in black,
# plus horizontal guides at reachability 1.0 and 0.5.
for klass, color in enumerate(["g.", "r.", "b.", "y.", "c."]):
    in_cluster = labels_opt == klass
    ax1.plot(space[in_cluster], reachability[in_cluster], color, alpha=0.3)
is_noise = labels_opt == -1
ax1.plot(space[is_noise], reachability[is_noise], "k.", alpha=0.3)
ax1.plot(space, np.full_like(space, 1.0, dtype=float), "k-", alpha=0.5)
ax1.plot(space, np.full_like(space, 0.5, dtype=float), "k-.", alpha=0.5)
ax1.set_ylabel("Reachability (epsilon distance)")
ax1.set_title("Reachability Plot")
# Scatter of the automatic OPTICS clustering in matching colours.
for klass, color in enumerate(["g.", "r.", "b.", "y.", "c."]):
    members = X_norm[optics1.labels_ == klass]
    ax2.plot(members[:, 0], members[:, 1], color, alpha=0.3)
ax2.plot(X_norm[optics1.labels_ == -1, 0], X_norm[optics1.labels_ == -1, 1], "k+", alpha=0.1)
ax2.set_title("Automatic Clustering\nOPTICS")
# Attach the OPTICS-1 labels to the raw coordinates, drop the noise points
# (label -1), and re-standardise for a second clustering pass.
X_optics1 = X.copy()  # new dataframe to hold the optics1 cluster labels
X_optics1
labels_optics1 = labels_opt1  # alias for convenience
X_optics1.insert(2, "Labels", labels_optics1, True)
X_optics1.sort_values(by=['Labels'])
X_optics1_cleaned = X_optics1.drop(X_optics1[X_optics1['Labels'] == -1].index)
X_optics1_cleaned.sort_values(by=['Labels'])
# BUG FIX: standardise only the coordinate columns — the original also scaled
# the "Labels" column, feeding the cluster id back in as a third feature of
# the re-clustering below. Per-column scaling means Lat/Lon values are
# unchanged by this fix; only the spurious third column is gone.
X_optics1_cleaned_norm = StandardScaler().fit_transform(X_optics1_cleaned[['Lat', 'Lon']])
X_optics1_cleaned_norm
# Re-run OPTICS on the noise-free, re-standardised coordinates.
# NOTE(review): this refits the existing `optics1` estimator in place,
# overwriting its earlier labels_/reachability_ — deliberate reuse here.
print ('optics 1 cleaned (without noise)')
optics1.fit(X_optics1_cleaned_norm)
labels_optics1_cleaned = optics1.labels_
counts_c, edges_c = np.histogram(labels_optics1_cleaned, bins=range(-1, len(set(labels_optics1_cleaned)) + 1))
print ('labels', dict(zip(edges_c, counts_c)))
print ('silhouette', metrics.silhouette_score(X_optics1_cleaned_norm, labels_optics1_cleaned))
print ('mean cluster dimension', statistics.mean(counts_c))
print ('median cluster dimension', statistics.median(counts_c))
# Scatter plot of the noise-free OPTICS-1 re-clustering.
colors = ['royalblue', 'maroon', 'forestgreen', 'mediumorchid', 'tan', 'deeppink', 'olive', 'goldenrod', 'lightcyan', 'navy', 'yellow', 'purple', 'black', 'grey', 'red']
to_color = np.vectorize(lambda label: colors[label % len(colors)])
plt.scatter(X_optics1_cleaned_norm[:, 0], X_optics1_cleaned_norm[:, 1], c=to_color(labels_optics1_cleaned))
plt.title('Clustering\nOPTICS 1 Cleaned')
plt.show()
import folium
from folium import plugins
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
import seaborn as sns
# Map centre: midpoint of two reference coordinates bounding the area.
lat = [43.7359, 43.6955]
lon = [10.4269, 10.3686]
lat_mean = np.mean(lat)
lon_mean = np.mean(lon)
lat, lng = lat_mean, lon_mean
map_clusters1 = folium.Map(location=[lat, lng], zoom_start=13.2)
# Colour scheme for the clusters: 15 evenly spaced rainbow colours as hex.
x = np.arange(15)
ys = [i + x + (i*x)**2 for i in range(15)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]
# Add one circle marker per point, coloured by its OPTICS-1 cluster.
markers_colors1 = []
# BUG FIX: loop variables renamed — the original reused `lat`/`lng`, clobbering
# the map-centre coordinates that the maps built further down still read.
for m_lat, m_lng, cluster in zip(X_optics1['Lat'], X_optics1['Lon'],
                                 X_optics1['Labels']):
    folium.vector_layers.CircleMarker(
        [m_lat, m_lng],
        radius=5,
        tooltip = 'Cluster ' + str(cluster),
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.9).add_to(map_clusters1)
print('Mappa con clustering Optics 1 with noise')
map_clusters1
# Map of the cleaned (noise-free) OPTICS-1 clustering.
# BUG FIX: centre on lat_mean/lon_mean — the original read `lat`/`lng`, which
# the previous marker loop had clobbered with the last marker's coordinates.
map_clusters2 = folium.Map(location=[lat_mean, lon_mean], zoom_start=13.2)
# Colour scheme for the clusters: 15 evenly spaced rainbow colours as hex.
x = np.arange(15)
ys = [i + x + (i*x)**2 for i in range(15)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]
# Add one circle marker per cleaned point, coloured by its cluster.
markers_colors2 = []
for m_lat, m_lng, cluster in zip(X_optics1_cleaned['Lat'], X_optics1_cleaned['Lon'],
                                 X_optics1_cleaned['Labels']):
    folium.vector_layers.CircleMarker(
        [m_lat, m_lng],
        radius=5,
        tooltip = 'Cluster ' + str(cluster),
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.9).add_to(map_clusters2)
print('Mappa con clustering Optics 1 cleaned')
map_clusters2
from sklearn.neighbors import NearestNeighbors
# k-NN distance curve: sort every point's distance to its nearest neighbour
# to eyeball a good eps for DBSCAN (the "knee" of the curve).
neigh = NearestNeighbors(n_neighbors=2)
nbrs = neigh.fit(X_norm)
# kneighbors returns, for each point, the distances to and indices of its
# k nearest neighbours (the first column is the point itself, distance 0).
distances, indices = nbrs.kneighbors(X_norm)
distances
distances = np.sort(distances, axis=0)[:, 1]
plt.plot(distances)
# The knee appears to be around eps=0.2.
# Use the knee method to find the most suitable parameters.
from scipy.spatial.distance import pdist, squareform
# Pairwise Euclidean distances between all observations (condensed vector).
distance = pdist(X_norm, 'euclidean')
print (distance)
# squareform converts between condensed and square distance matrices.
distance = squareform(distance)
print()
print(distance)
k = 5
# Distance from each point to its k-th nearest neighbour.
# PERF FIX: the original argsort-ed every row just to pick one element
# (O(n^2 log n)); a single per-row sort yields the same values directly.
kth_distances = np.sort(distance, axis=1)[:, k]
plt.plot(range(0, len(kth_distances)), sorted(kth_distances))
plt.ylabel('dist from %sth neighbor' % k, fontsize=14)
plt.xlabel('sorted distances', fontsize=14)
plt.tick_params(axis='both', which='major', labelsize=14)
plt.show()
# The knee appears to be around eps=0.2 or 0.3.
# DBSCAN attempt 1: eps=0.2 as suggested by the knee plots above.
print ('dbscan 1, eps 0.2')
dbscan1 = DBSCAN(eps=0.2, min_samples=5)
dbscan1.fit(X_norm)
labels_dbscan1 = dbscan1.labels_
counts1, edges1 = np.histogram(labels_dbscan1, bins=range(-1, len(set(labels_dbscan1)) + 1))
counts_by_label = dict(zip(edges1, counts1))
print ('labels', counts_by_label)
print ('silhouette', metrics.silhouette_score(X_norm, labels_dbscan1))
print ('mean cluster dimension', statistics.mean(counts1))
print ('median cluster dimension', statistics.median(counts1))
print ('% outliers', counts_by_label[-1]*100/sum(counts1))
# BUG FIX: this snippet was pasted broken — `dbscan1.centers_` does not exist
# (DBSCAN has no centroid attribute), and `data`, `clusters` and `cdist` were
# undefined. Compute each cluster's centroid as the mean of its members and
# measure member-to-centroid distances on the fitted data instead.
from scipy.spatial.distance import cdist
cluster_ids = sorted(set(labels_dbscan1) - {-1})  # real clusters, noise excluded
# centroid of each cluster = mean of its member points
centroids = np.array([X_norm[labels_dbscan1 == i].mean(axis=0) for i in cluster_ids])
# points keeps members grouped by cluster so indices line up with distances
points = np.empty((0, X_norm.shape[1]), float)
# distances will be used to look for outliers within each cluster
distances = np.empty((0,), float)
for i, center_elem in zip(cluster_ids, centroids):
    members = X_norm[labels_dbscan1 == i]
    # cdist: distance between the centroid and every member of cluster i
    distances = np.append(distances, cdist([center_elem], members, 'euclidean'))
    points = np.append(points, members, axis=0)
# Scatter plot of DBSCAN attempt 1.
colors = ['royalblue', 'maroon', 'forestgreen', 'mediumorchid', 'tan', 'deeppink', 'olive', 'goldenrod', 'lightcyan', 'navy', 'yellow', 'purple', 'black', 'grey']
to_color = np.vectorize(lambda label: colors[label % len(colors)])
plt.scatter(X_norm[:, 0], X_norm[:, 1], c=to_color(labels_dbscan1))
plt.title('Clustering\nDBSCAN 1')
plt.show()
# DBSCAN attempt 2: slightly larger radius.
print ('dbscan 2, eps 0.3')
dbscan2 = DBSCAN(eps=0.3, min_samples=5)
dbscan2.fit(X_norm)
labels_dbscan2 = dbscan2.labels_
counts2, edges2 = np.histogram(labels_dbscan2, bins=range(-1, len(set(labels_dbscan2)) + 1))
counts_by_label = dict(zip(edges2, counts2))
print ('labels', counts_by_label)
print ('silhouette', metrics.silhouette_score(X_norm, labels_dbscan2))
print ('mean cluster dimension', statistics.mean(counts2))
print ('median cluster dimension', statistics.median(counts2))
print ('% outliers', counts_by_label[-1]*100/sum(counts2))
# Scatter plot of DBSCAN attempt 2.
colors = ['royalblue', 'maroon', 'forestgreen', 'mediumorchid', 'tan', 'deeppink', 'olive', 'goldenrod']
to_color = np.vectorize(lambda label: colors[label % len(colors)])
plt.scatter(X_norm[:, 0], X_norm[:, 1], c=to_color(labels_dbscan2))
plt.title('Clustering\nDBSCAN 2')
plt.show()
# DBSCAN attempt 3: same radius as attempt 1 but a looser min_samples.
# BUG FIX: the printed banner had a typo ('eps 0.2m min_samples=3').
print ('dbscan 3, eps 0.2, min_samples=3')
dbscan3 = DBSCAN(eps=0.2, min_samples=3)
dbscan3.fit(X_norm)
labels_dbscan3 = dbscan3.labels_
hist, bins = np.histogram(labels_dbscan3, bins=range(-1, len(set(labels_dbscan3)) + 1))
print ('labels', dict(zip(bins, hist)))
print ('silhouette', metrics.silhouette_score(X_norm, labels_dbscan3))
print ('mean cluster dimension', statistics.mean(hist))
print ('median cluster dimension', statistics.median(hist))
print ('% outliers', dict(zip(bins, hist))[-1]*100/sum(hist))
# Scatter plot of DBSCAN attempt 3.
colors = ['royalblue', 'maroon', 'forestgreen', 'mediumorchid', 'tan', 'deeppink', 'olive', 'goldenrod']
to_color = np.vectorize(lambda label: colors[label % len(colors)])
plt.scatter(X_norm[:, 0], X_norm[:, 1], c=to_color(labels_dbscan3))
plt.title('Clustering\nDBSCAN 3')
plt.show()
# Persist the full dataframe with the DBSCAN-1 and DBSCAN-3 labels attached.
df_dbscan1 = df.copy()
df_dbscan1.insert(8, "Labels", labels_dbscan1, True)
df_dbscan1
df_dbscan1.to_csv('../df_dbscan.csv')
df_dbscan3 = df.copy()
df_dbscan3.insert(8, "Labels", labels_dbscan3, True)
df_dbscan3
df_dbscan3.to_csv('../df_dbscan3.csv')
# Attach the DBSCAN-1 labels to the coordinates, drop the noise points
# (label -1), and re-standardise for a second clustering pass.
X_dbscan1 = X.copy()  # new dataframe to hold the dbscan cluster labels
X_dbscan1
X_dbscan1.insert(2, "Labels", labels_dbscan1, True)
X_dbscan1_cleaned = X_dbscan1.drop(X_dbscan1[X_dbscan1['Labels'] == -1].index)
X_dbscan1_cleaned.sort_values(by=['Labels'])
# BUG FIX: standardise only Lat/Lon — the original also scaled the "Labels"
# column, so the cluster id leaked into the feature space of the re-fit below.
# Per-column scaling leaves the Lat/Lon values themselves unchanged.
X_dbscan1_cleaned_norm = StandardScaler().fit_transform(X_dbscan1_cleaned[['Lat', 'Lon']])
X_dbscan1_cleaned_norm
# Re-run DBSCAN-1 on the noise-free, re-standardised coordinates.
# NOTE(review): this refits the existing `dbscan1` estimator, overwriting its
# previous labels_ — deliberate reuse.
print ('dbscan 1 cleaned (without noise)')
dbscan1.fit(X_dbscan1_cleaned_norm)
labels_dbscan1_cleaned = dbscan1.labels_
counts_c, edges_c = np.histogram(labels_dbscan1_cleaned, bins=range(-1, len(set(labels_dbscan1_cleaned)) + 1))
print ('labels', dict(zip(edges_c, counts_c)))
print ('silhouette', metrics.silhouette_score(X_dbscan1_cleaned_norm, labels_dbscan1_cleaned))
print ('mean cluster dimension', statistics.mean(counts_c))
print ('median cluster dimension', statistics.median(counts_c))
# Scatter plot of the noise-free DBSCAN-1 re-clustering.
colors = ['royalblue', 'maroon', 'forestgreen', 'mediumorchid', 'tan', 'deeppink', 'olive', 'goldenrod', 'lightcyan', 'navy', 'yellow', 'purple', 'black', 'grey', 'red']
to_color = np.vectorize(lambda label: colors[label % len(colors)])
plt.scatter(X_dbscan1_cleaned_norm[:, 0], X_dbscan1_cleaned_norm[:, 1], c=to_color(labels_dbscan1_cleaned))
plt.title('Clustering\nDBSCAN 1 Cleaned')
plt.show()
# Rebuild the combined "[lat , lon]" coordinate string for the cleaned set.
X_dbscan1_cleaned["Coords"] = ("[" + X_dbscan1_cleaned["Lat"].astype(str)
                               + " , " + X_dbscan1_cleaned["Lon"].astype(str) + "]")
X_dbscan1_cleaned
import folium
from folium import plugins
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
import seaborn as sns
# Map of DBSCAN-1 including noise.
# BUG FIX: centre on lat_mean/lon_mean — the original read `lat`/`lng`, which
# earlier marker loops had clobbered with the last marker's coordinates.
map_clusters3 = folium.Map(location=[lat_mean, lon_mean], zoom_start=13.2)
# Colour scheme for the clusters: 15 evenly spaced rainbow colours as hex.
x = np.arange(15)
ys = [i + x + (i*x)**2 for i in range(15)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]
# Add one circle marker per point, coloured by its DBSCAN-1 cluster.
markers_colors = []
for m_lat, m_lng, cluster in zip(X_dbscan1['Lat'], X_dbscan1['Lon'],
                                 X_dbscan1['Labels']):
    folium.vector_layers.CircleMarker(
        [m_lat, m_lng],
        radius=5,
        tooltip = 'Cluster ' + str(cluster),
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.9).add_to(map_clusters3)
print('Mappa con clustering Dbscan 1 with noise')
map_clusters3
# Map of the cleaned (noise-free) DBSCAN-1 clustering.
# BUG FIX: centre on lat_mean/lon_mean instead of the clobbered `lat`/`lng`.
map_clusters4 = folium.Map(location=[lat_mean, lon_mean], zoom_start=13.2)
# Colour scheme for the clusters: 15 evenly spaced rainbow colours as hex.
x = np.arange(15)
ys = [i + x + (i*x)**2 for i in range(15)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]
# Add one circle marker per cleaned point, coloured by its cluster.
markers_colors = []
for m_lat, m_lng, cluster in zip(X_dbscan1_cleaned['Lat'], X_dbscan1_cleaned['Lon'],
                                 X_dbscan1_cleaned['Labels']):
    folium.vector_layers.CircleMarker(
        [m_lat, m_lng],
        radius=5,
        tooltip = 'Cluster ' + str(cluster),
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.9).add_to(map_clusters4)
print('Mappa con clustering Dbscan 1 cleaned')
map_clusters4
import pygeohash as pgh
import geohash as gh
import geopandas as gpd
from polygon_geohasher.polygon_geohasher import geohash_to_polygon
# Geohash-encode every (Lat, Lon) pair at precision 6.
X_geohash = X.copy()  # fresh copy of the coordinate columns
X_geohash
X_geohash['geohash'] = X_geohash.apply(lambda row: gh.encode(row.Lat, row.Lon, precision=6), axis=1)
# Count the geohash codes in order to build a map and its colour scale:
# Occurrence count of every geohash code, in sorted-code order.
# PERF FIX: the original called list.count once per element — O(n^2);
# value_counts tallies every code in a single pass with identical results.
# (The unused `count_geohash` list was also dropped.)
codes_geohash = X_geohash['geohash'].tolist()
codes_geohash.sort()
codes_geohash
code_totals = X_geohash['geohash'].value_counts()
values = [int(code_totals[code]) for code in codes_geohash]
values_array = np.array(values)
values = set(values_array)  # distinct count values, for the map scale
values
# L2-normalised counts — computed but not used further for now.
norm = np.linalg.norm(values_array)
normal_array = values_array / norm
print(normal_array)
# Round-trip check: decode a couple of geohash codes back to coordinates.
decoded_location = gh.decode(X_geohash['geohash'][0])
decoded_location2 = gh.decode(X_geohash['geohash'][1])
decoded_location
decoded_location2
# Neighbouring cells of two sample codes:
gh.neighbors(X_geohash['geohash'][0])
gh.neighbors(X_geohash['geohash'][200])
# Build a dataframe to draw the geohash grid on the folium map:
import json
# GeoDataFrame: one row per point with its geohash cell polygon and the
# number of points sharing that cell.
# BUG FIX: the original paired `values_array` (ordered by sorted geohash)
# with locations in dataframe order, mis-assigning counts to rows; mapping
# the per-code totals onto the geohash column keeps every row aligned.
cell_counts = X_geohash['geohash'].map(X_geohash['geohash'].value_counts())
df_geo = gpd.GeoDataFrame({'location': df_coords.tolist(), 'value': cell_counts.to_numpy()})
df_geo['geohash'] = X_geohash['geohash']
df_geo['geometry'] = df_geo['geohash'].apply(geohash_to_polygon)
# Modern pyproj CRS syntax — the {'init': 'epsg:4326'} form is deprecated.
df_geo.crs = 'EPSG:4326'
print('features.properties.geohash')
display(json.loads(df_geo.to_json())['features'][0])
display(df_geo.head())
import folium
# Choropleth of geohash cells coloured by the number of points they contain.
lat = [43.7359, 43.6955]
lon = [10.4269, 10.3686]
lat_mean = np.mean(lat)
lon_mean = np.mean(lon)
lat, lng = lat_mean, lon_mean
m = folium.Map((lat, lng), zoom_start=13.2)
folium.Choropleth(geo_data=df_geo,
                  name='choropleth',
                  data=df_geo,
                  columns=['geohash', 'value'],
                  key_on='feature.properties.geohash',
                  fill_color='YlGn',
                  fill_opacity=0.7,
                  line_opacity=0.2,
                  # BUG FIX: the legend carried the placeholder text 'asdf'
                  legend_name='points per geohash cell').add_to(m)
m
# Attach the geohash codes to the full dataframe and persist it.
df_geohash = df.copy()
df_geohash.insert(8, "geohash", X_geohash['geohash'], True)
df_geohash
df_geohash.to_csv('../df_geohash.csv')
# Replace each geohash string with a small integer id for readability.
# Ids are assigned in sorted code order — the same 0..n-1 mapping the original
# groupby-then-replace loop produced (groupby sorts its keys by default).
# PERF FIX: one vectorised .map instead of a full-column .replace per unique
# code; the unused `lista` accumulator was dropped.
code_to_id = {code: i for i, code in enumerate(sorted(df_geohash['geohash'].unique()))}
df_geohash['geohash'] = df_geohash['geohash'].map(code_to_id)
df_geohash
# Persist the numbered version alongside the original.
df_geohash.to_csv('../df_geohash_to_numbers.csv')